* Settings
*   Pin interpreter behavior to Stata 16, then run the shared project settings script.
*   The global SSDIMed must already be defined when this file starts, because it is
*   used to locate the settings script itself.
version 16
do "$SSDIMed/scripts/_auxiliary/_project_settings.do"

* Run as batch; auto-creates *.log file with same base name as the *.do file
* > statamp -b 03b_build_analysis_samples.do & 

* -
* -----------------------------------------------------------------------------------------------
* From the master panel, create files for the sample and units of analysis
* -----------------------------------------------------------------------------------------------

* Data A. DI entrant outcomes
*   Unit of observation: beneficiary (bene_id) by reference year (rfrnc_yr)

* Read the master cohort panel; confirm that the key is unique and that age at
* coverage start lies in 20-64 before any restriction is applied
use "$SSDIMed/data/temp/master_cohort_panel.dta", clear
gisid bene_id rfrnc_yr
assert inrange(age_year_covstart_fill, 20, 64)

* Sample restrictions, applied one at a time so the log reports attrition per step
*   1) In DIB sample, = inlist(1, dib_bic_init, orec_dib_any_ever)
*   2) Age 20-62 at time of Medicare eligibility
*   3) Annual observations up through year of death
*      (a missing death date compares above every year, so such rows are not dropped)
*   4) Observations after the initial (usually partial) year
drop if dib_sample != 1
drop if !inrange(age_year_covstart_fill, 20, 62)
drop if rfrnc_yr > year(death_dt_firstnm)
drop if rfrnc_yr <= init_yr

* The panel key must still be unique after the restrictions
gisid bene_id rfrnc_yr

* Health care use variables
*   Inpatient totals are the sum of the acute_* and oip_* components
generate ip_cov_days = acute_cov_days + oip_cov_days
generate ip_stays    = acute_stays + oip_stays
local use_vars ip_cov_days ip_stays ip_er_visits snf_cov_days hos_cov_days hh_visits

* QC: each use variable must be missing in exactly the same rows as tot_pmt
foreach use_var of local use_vars {
  assert missing(`use_var') == missing(tot_pmt)
}

* Fixed-effect group identifiers (each labelled with the expression that built it)
gegen fipscounty_firstnm_g = group(fipscounty_firstnm)
label variable fipscounty_firstnm_g "group(fipscounty_firstnm)"
gegen county_years_enrolled = group(fipscounty_firstnm years_since_covstart)
label variable county_years_enrolled "group(fipscounty_firstnm years_since_covstart)"

* Cluster identifiers
gegen county_year = group(fipscounty_firstnm covstart_year)
label variable county_year "group(fipscounty_firstnm covstart_year)"
gegen county_mofd = group(fipscounty_firstnm covstart_month)
label variable county_mofd "group(fipscounty_firstnm covstart_month)"

* Re-check the panel key, then write the bene-year analysis file
gisid bene_id rfrnc_yr
save "$SSDIMed/data/analysis/bene-year_outcomes_sample-main.dta", replace

* Sample analyses
*   Exploratory regressions on the bene-year file saved above. The block is wrapped in
*   "if 0" and therefore never runs in batch mode; change the condition to 1 to run it.
*   NOTE(review): _regress is called with an a() option here. Confirm that this resolves
*   to the intended absorbing-regression command (possibly a wrapper defined by the
*   project settings script) before enabling the block.
if 0 {

* Reload the saved file and re-verify the sample rules applied when it was built
use "$SSDIMed/data/analysis/bene-year_outcomes_sample-main.dta", clear
assert dib_sample == 1
assert inrange(age_year_covstart_fill, 20, 62)
assert inrange(rfrnc_yr, init_yr + 1, year(death_dt_firstnm))

* Outcome used in every regression below; switch to the commented line to use died
local y tot_pmt
* local y died

* Section 1: coverage-start-year indicators, alone and with age / initial-age indicators
di _newline "============================================================================================="
di          "Section 1"
di          "============================================================================================="
_regress `y' i.covstart_year,                       a(county_years_enrolled)
_regress `y' i.covstart_year i.age,                 a(county_years_enrolled)
_regress `y' i.covstart_year i.age_init,            a(county_years_enrolled)

* Section 2: county unemployment rate at application entered linearly
di _newline "============================================================================================="
di          "Section 2"
di          "============================================================================================="
_regress `y' unemp_rate_county_atapp,               a(county_years_enrolled)
_regress `y' unemp_rate_county_atapp i.age,         a(county_years_enrolled)
_regress `y' unemp_rate_county_atapp i.age_init,    a(county_years_enrolled)

* Section 3: indicators for each level of unemp_rate_q20_atapp
di _newline "============================================================================================="
di          "Section 3"
di          "============================================================================================="
_regress `y' i.unemp_rate_q20_atapp,                a(county_years_enrolled)
_regress `y' i.unemp_rate_q20_atapp i.age,          a(county_years_enrolled)
_regress `y' i.unemp_rate_q20_atapp i.age_init,     a(county_years_enrolled)

}


* -----------------
* Data B. DI entry
*   Observations: (a) county month age, and (b) county month

* Read the master cohort panel; confirm that the key is unique and that age at
* coverage start lies in 20-64 before any restriction is applied
use "$SSDIMed/data/temp/master_cohort_panel.dta", clear
gisid bene_id rfrnc_yr
assert inrange(age_year_covstart_fill, 20, 64)

* Sample restrictions, applied one at a time so the log reports attrition per step
*   1) In DIB sample, = inlist(1, dib_bic_init, orec_dib_any_ever)
*   2) Age 20-62 at time of Medicare eligibility
*   3) Annual observations up through year of death
*      (a missing death date compares above every year, so such rows are not dropped)
*   4) Initial year of observation only
drop if dib_sample != 1
drop if !inrange(age_year_covstart_fill, 20, 62)
drop if rfrnc_yr > year(death_dt_firstnm)
drop if rfrnc_yr != init_yr

* Keeping only the initial year must leave exactly one row per beneficiary
gisid bene_id

* (a) Collapse to county months, by gender and age at entry

* Each at-application variable must be constant within county and coverage-start
* month; the collapse below takes the (first) value in each cell and relies on this
local vars_first *_atapp
foreach atapp_var of varlist `vars_first' {
  noisily display "`atapp_var'"
  bysort fipscounty_firstnm covstart_month: assert `atapp_var' == `atapp_var'[1]
}

* A column of ones, so that summing within a cell counts its rows
generate double count = 1
* Wider list of summed variables, currently disabled:
* local vars_sum count esrd_init orec_dib_* bic_primary
local vars_sum count
gcollapse (first) `vars_first' (sum) `vars_sum',                             ///
  by(fipscounty_firstnm covstart_month age_year_covstart_fill male_init)     ///
  fast labelformat(#sourcelabel#)
label variable count "Number of individuals in the collapsed cell"

* Rectangularize: add a row for every combination of county, month, age and gender
* that had no entrants, and set its summed variables to zero
fillin fipscounty_firstnm covstart_month age_year_covstart_fill male_init
foreach sum_var of varlist `vars_sum' {
  replace `sum_var' = 0 if (_fillin == 1)
}

* On filled-in rows at least one of the two unemployment rates is still missing
assert missing(unemp_rate_county_atapp, unemp_rate_national_atapp) if _fillin == 1
drop _fillin

* Add in
*   - Unemployment rates for obs with no entry (and confirm that this has no impact on
*     unemp_rate_county_atapp values for obs with entry)

* Park the collapsed copies under a _v1 suffix so the merged-in values can be compared
rename *_atapp *_atapp_v1

* The public file is keyed on fipscounty, so build a temporary key with that name
generate fipscounty = fipscounty_firstnm
merge m:1 fipscounty covstart_month                                  ///
  using "$SSDIMed/data/proc/public/unemp_pop_atapp.dta",             ///
  keep(master match) nogenerate noreport                             ///
  keepusing(unemp*_atapp pop_19_61_atapp)
drop fipscounty

* Wherever the collapsed copy is non-missing, the merged-in value must equal it
foreach merged_var of varlist unemp*_atapp {
  display "`merged_var'"
  assert `merged_var' == `merged_var'_v1 if !missing(`merged_var'_v1)
}
drop *atapp_v1

* Add in
*   - Population counts by gender and age in months of filing ("atapp")
*       Medicare data tell us age at Medicare coverage
*       Calculate age "atapp" under a gap of 2 years between SSDI filing and Medicare coverage

* Temporary merge keys matching the population file: county, gender code, age at application
generate fipscounty = fipscounty_firstnm
generate gender = cond(male_init == 1, "m", cond(male_init == 0, "f", ""))
generate age_atapp = age_year_covstart_fill - 2

* The keys must identify rows uniquely before the 1:1 merge
bysort fipscounty covstart_month gender age_atapp: assert _N == 1
merge 1:1 fipscounty covstart_month gender age_atapp                 ///
  using "$SSDIMed/data/proc/public/popgender_atapp_long.dta",        ///
  keep(master match) nogenerate noreport

* Drop the temporary keys and give the merged population a descriptive name
drop fipscounty gender age_atapp
rename pop_atapp pop_agegender_atapp
label variable pop_agegender_atapp "Gender-age (age at Medicare covstart - 2) population at time of SSDI application"

* Convert counts to incidence per million population
*   Denominator: gender-age population at time of application (pop_agegender_atapp)
* Disabled generalization over several count and population variables, kept for reference:
* foreach sum_var of varlist `vars_sum' {
*   foreach pop_var in pop_age {
*     gen double incidence_`sum_var'_`pop_var' = 10^6 * `sum_var' / `pop_var'_atapp_gap1yr
*     label var incidence_`sum_var'_`pop_var' "Incidence in terms of `sum_var' per million `pop_var'_atapp_gap1yr"
*   }
* }
generate double incidence_pop_agegender_atapp = (10^6 * count) / pop_agegender_atapp
label variable incidence_pop_agegender_atapp "Incidence = count per million pop_agegender_atapp"

* Rebuild the calendar year from the monthly date and place it right after the month
generate covstart_year = year(dofm(covstart_month))
assert inrange(covstart_year, 1993, 2017)
order covstart_year, after(covstart_month)

* Rebuild the group identifiers, which were not carried through the collapse
gegen fipscounty_firstnm_g = group(fipscounty_firstnm)
label variable fipscounty_firstnm_g "group(fipscounty_firstnm)"
gegen county_year = group(fipscounty_firstnm covstart_year)
label variable county_year "group(fipscounty_firstnm covstart_year)"
gegen county_mofd = group(fipscounty_firstnm covstart_month)
label variable county_mofd "group(fipscounty_firstnm covstart_month)"

* Save data on entry by age and gender (for supplemental analysis)
*   One row per county, month, age and gender cell
bysort fipscounty_firstnm covstart_month age_year_covstart_fill male_init: assert _N == 1
compress
save "$SSDIMed/data/analysis/county-month-agegender_entry_sample-main.dta", replace

* Aggregate across gender to get incidence by age (for main analysis)
*   Starts from the age-gender file saved above and collapses to one row per
*   county, coverage-start month and age.
use "$SSDIMed/data/analysis/county-month-agegender_entry_sample-main.dta", clear

* Variables carried through with (first): group ids, unemployment rates, year, population 19-61
local first_vars fipscounty_firstnm_g county_* unemp* covstart_year pop_19_61_atapp

* Entrant counts and gender-specific populations are summed over gender.
* NOTE(review): nansum skips missing inputs, so a cell in which only one gender has a
* matched population yields a one-gender denominator rather than missing. Confirm this
* is intended.
gcollapse (first) `first_vars' (nansum) count pop_age_atapp = pop_agegender_atapp, by(fipscounty_firstnm covstart_month age_year_covstart_fill) labelformat(#sourcelabel#)

* labelformat(#sourcelabel#) copies the gender-specific source label onto the summed
* population; replace it so the label describes the aggregated variable
label var pop_age_atapp "Age (age at Medicare covstart - 2) population, summed over gender, at time of SSDI application"

* Incidence per million population of that age
gen double incidence_pop_age_atapp = 10^6*count/pop_age_atapp
label var incidence_pop_age_atapp "Incidence = count per million pop_age_atapp"

* One row per county, month and age cell
bys fipscounty_firstnm covstart_month age_year_covstart_fill: assert _N == 1
compress 
save "$SSDIMed/data/analysis/county-month-age_entry_sample-main.dta", replace


* (b) Collapse to county months (aggregate across ages)
* OLD CODE: if decide to aggregate across ages, will need to tweak slightly to run 
*   The whole section is wrapped in "if 0" and never runs in batch mode.
*   NOTE(review): it refers to variables that the visible code above does not create
*   (pop_all_atapp, pop_18_60_atapp, inc_* incidence variables, whereas the current
*   pipeline merges pop_19_61_atapp and builds incidence_* variables). Update these
*   names before enabling it.
if 0 {
  * Variables expected to be constant within county and month; verified, then kept via (first)
  local vars_first unemp_*_atapp pop_all_atapp pop_18_60_atapp covstart_year fipscounty_firstnm_g county_year county_mofd
  foreach var of varlist `vars_first' {
    di "`var'"
    * by without sort: relies on the data already being sorted by county and month
    by fipscounty_firstnm covstart_month: assert `var' == `var'[1]
  }
  gcollapse (first) `vars_first' (nansum) count inc_*_pop_all inc_*_pop_18_60, by(fipscounty_firstnm covstart_month) fast labelformat(#sourcelabel#)
  * Summed incidence must equal incidence recomputed from the summed count
  assert abs(inc_count_pop_18_60 - 10^6*count/pop_18_60_atapp) < 1e-9 if !missing(inc_count_pop_18_60)
  save "$SSDIMed/data/analysis/county-month_entry_sample-main.dta", replace
  
  
  * Sample analyses 
  *   Nested disabled block: stays off even if the outer condition is switched on
  if 0 {
  
  use "$SSDIMed/data/analysis/county-month_entry_sample-main.dta", clear
  _regress inc_count_pop_all   unemp_rate_county_atapp [aw = pop_all_atapp ], a(fipscounty_firstnm)
  _regress inc_count_pop_18_60 unemp_rate_county_atapp [aw = pop_18_60_atapp], a(fipscounty_firstnm)
  
  local pop pop_all
  _regress inc_esrd_init_`pop' unemp_rate_county_atapp [aw = `pop'_atapp], a(fipscounty_firstnm)
  _regress inc_orec_dib_any_init_`pop' unemp_rate_county_atapp [aw = `pop'_atapp], a(fipscounty_firstnm)
  
  
  
  use "$SSDIMed/data/analysis/county-month-age_entry_sample-main.dta", clear
  * A pasted interactive prompt character used to precede this command; it was removed
  * because it would stop the do-file with an unrecognized-command error
  gegen county_age = group(fipscounty_firstnm age_year_covstart_fill )
  
  * No effect on ESRD
  _regress inc_esrd_init_pop_age unemp_rate_county_atapp [aw = pop_age_atapp ], a(fipscounty_firstnm)
  
  * Big effect on DIB (ever or initial)
  _regress inc_orec_dib_any_ever_pop_age unemp_rate_county_atapp [aw = pop_age_atapp ], a(fipscounty_firstnm)
  _regress inc_orec_dib_any_init_pop_age unemp_rate_county_atapp [aw = pop_age_atapp ], a(fipscounty_firstnm)
  
  * Smaller (2.5%) effect ages 21-51, bigger (5.2%) effect ages 52-62
  _regress inc_orec_dib_any_ever_pop_age unemp_rate_county_atapp if inrange(age_year_covstart_fill, 21, 51) [aw = pop_age_atapp ], a(fipscounty_firstnm)
  _regress inc_orec_dib_any_ever_pop_age unemp_rate_county_atapp if inrange(age_year_covstart_fill, 52, 62) [aw = pop_age_atapp ], a(fipscounty_firstnm)
  
  * Ventiles
  
  }
}



** EOF


